We will use the ggplot2 package.
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.3
The UsingR package contains Pearson’s father/son height data from ca. 1900.
#install.packages("UsingR")
library(UsingR)
## Warning: package 'UsingR' was built under R version 4.4.3
## Loading required package: MASS
## Loading required package: HistData
## Warning: package 'HistData' was built under R version 4.4.3
## Loading required package: Hmisc
## Warning: package 'Hmisc' was built under R version 4.4.3
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
##
## format.pval, units
# Load Pearson's father/son heights shipped with UsingR and preview the
# first six rows.
data("father.son", package = "UsingR")
head(father.son, n = 6L)
## fheight sheight
## 1 65.04851 59.77827
## 2 63.25094 63.21404
## 3 64.95532 63.34242
## 4 65.75250 62.79238
## 5 61.13723 64.28113
## 6 63.02254 64.24221
Now let us build a scatter plot of the fathers’ and sons’ heights, with the father’s height on the x-axis and the son’s height on the y-axis.
# Basic scatter plot: one point per father/son pair.
ggplot(father.son, aes(fheight, sheight)) +
  geom_point()
Add alpha = 0.4 to see through the points (they will be
60% transparent).
# Same scatter plot with semi-transparent points, so that overlapping
# observations show up as darker regions.
ggplot(father.son, aes(fheight, sheight)) +
  geom_point(alpha = 0.4)
Add descriptions of the axes and a title (the points are also coloured blue).
# Blue, semi-transparent points with axis labels and a title.
ggplot(father.son, aes(fheight, sheight)) +
  geom_point(colour = "blue", alpha = 0.4) +
  labs(
    x = "Father's height [inches]",
    y = "Son's height [inches]",
    title = "Scatter plot of Pearson's father/son data"
  )
Let us colour the dots where the fathers were ~70 inches tall.
# Helper table: families whose father's height rounds to 70 inches.
df70 <- with(father.son, father.son[round(fheight) == 70, ])

# Draw every family in black first, then overplot the ~70-inch subset in
# blue so that it stands out.
ggplot(father.son, aes(fheight, sheight)) +
  geom_point(colour = "black", alpha = 0.4) +
  geom_point(data = df70, colour = "blue", alpha = 0.4) +
  labs(
    x = "Father's height [inches]",
    y = "Son's height [inches]",
    title = "Scatter plot of Pearson's father/son data"
  )
Now, colour the dots where the fathers were ~67 inches tall (in green) and ~64 inches tall (in yellow).
# Helper tables: families whose father's height rounds to 67 / 64 inches.
df67 <- with(father.son, father.son[round(fheight) == 67, ])
df64 <- with(father.son, father.son[round(fheight) == 64, ])

# Base layer in black, then one overplotted layer per highlighted group.
# The plot is stored in G so that later chunks can extend it.
G <- ggplot(father.son, aes(fheight, sheight)) +
  geom_point(colour = "black", alpha = 0.4) +
  geom_point(data = df70, colour = "blue", alpha = 0.4) +
  geom_point(data = df67, colour = "chartreuse4", alpha = 0.4) +
  geom_point(data = df64, colour = "yellow2", alpha = 0.4) +
  labs(
    x = "Father's height [inches]",
    y = "Son's height [inches]",
    title = "Scatter plot of Pearson's father/son data"
  )
G
What is the average height of the son if the father’s height is around (a) 64, (b) 67, (c) 70 inches? Store the results in variables called avgs_64, avgs_67, avgs_70. Add the points and the regression line (linear regression) to the plot.
# Mean son's height within each highlighted group of fathers.
# Equivalent one-liner that does not need the helper table:
# mean(father.son$sheight[round(father.son$fheight) == 64])
avgs_64 <- mean(df64[["sheight"]])
avgs_64
## [1] 66.70139
avgs_67 <- mean(df67[["sheight"]])
avgs_67
## [1] 68.13977
avgs_70 <- mean(df70[["sheight"]])
avgs_70
## [1] 69.76845
# Extend the highlighted scatter plot G with:
#   * one large point per group, placed at the nominal father's height
#     (64, 67, 70 inches) and the mean son's height of that group;
#   * the least-squares regression line of son's height on father's height.
# The single-row data frames carry their own x/y columns, so each layer
# overrides the fheight/sheight mapping inherited from G.
G +
  geom_point(data = data.frame(x = 64, y = avgs_64), aes(x = x, y = y),
             col = "orange", size = 4) +
  geom_point(data = data.frame(x = 67, y = avgs_67), aes(x = x, y = y),
             col = "darkgreen", size = 4) +
  geom_point(data = data.frame(x = 70, y = avgs_70), aes(x = x, y = y),
             col = "blue", size = 4) +
  # The formula is stated explicitly so that geom_smooth() does not print
  # its "using formula = 'y ~ x'" message on every render.
  geom_smooth(method = "lm", formula = y ~ x, col = "red")
Boxplots: let’s draw two horizontal boxplots in one chart, one for the fathers’
heights and one for the sons’ heights, using ggplot.
First, reshape the data:
# Reshape from wide (one column per family member) to long format:
# one row per measured height, with `group` saying whose height it is.
library(dplyr)
library(tidyr)
df_long <- father.son %>%
  # Give the columns their display names first, so that the pivoted
  # `group` column already holds "Father" / "Son".
  rename(Father = fheight, Son = sheight) %>%
  pivot_longer(
    cols = c(Father, Son),
    names_to = "group",
    values_to = "height"
  )
head(df_long)
## # A tibble: 6 × 2
## group height
## <chr> <dbl>
## 1 Father 65.0
## 2 Son 59.8
## 3 Father 63.3
## 4 Son 63.2
## 5 Father 65.0
## 6 Son 63.3
# Horizontal boxplots: height on the x-axis, one box per group.
ggplot(
  data = df_long,
  mapping = aes(y = group, x = height, fill = group)
) +
  geom_boxplot() +
  labs(
    title = "Father vs Son Heights",
    x = "Height (inches)",
    y = ""
  )
You can use a similar ggplot structure to generate violin plots. Just
use geom_violin instead of geom_boxplot:
# Horizontal violin plots: same layout as the boxplots, different geom.
ggplot(
  data = df_long,
  mapping = aes(y = group, x = height, fill = group)
) +
  geom_violin() +
  labs(
    title = "Father vs Son Heights",
    x = "Height (inches)",
    y = ""
  )
Recommended book on visualization:
Fundamentals of Data Visualization
GitHub page of the book with code in R
Task: try to create a “Strip chart” using the last example (boxplots and violins) as a starting point.
# Strip chart: raw observations per group. Points are jittered vertically
# only (width = 0), so the plotted heights are left untouched.
ggplot(
  data = df_long,
  mapping = aes(y = group, x = height, colour = group)
) +
  geom_jitter(alpha = 0.4, height = 0.2, width = 0) +
  labs(
    title = "Father vs Son Heights",
    x = "Height (inches)",
    y = ""
  )
Another idea: “plotly” wrapper enabling some interactivity.
# One-time setup, if plotly is not installed yet:
# install.packages("plotly")
library(plotly)

# Convert the static ggplot object G into an interactive plotly widget.
ggplotly(p = G)